
clear all
set more off

global dirin1 = "RAW" // input folder
global dirout = "INTERMEDIATE" // place to save collapsed files

****** PRODUCTS **********

// Run "replace_dquotes.py" with the following option, "products.tsv products_transformed.tsv"
// At Windows, use Spyder, go to run settings, add command line options

insheet using "RAW/products_transformed.tsv", tab clear

label variable upc "UPC code of product"
label variable upc_ver_uc "UPC code version"
label variable upc_descr "Product description"
label variable product_module_code "Product module code"
label variable product_module_descr "Product module description"
label variable product_group_code "Product group code"
label variable product_group_descr "Product group description"
label variable department_code "Department code"
label variable department_descr "Department description"
label variable brand_code "Nielsen-assigned codes for each brand"
label variable brand_descr "Brand description"
label variable multi "Number of units in a multipack"
label variable size1_code "Size code"
label variable size1_amount "Numeric size of the product"
label variable size1_units "Description of the unit of measure for the size1_amount"
label variable dataset_found_uc "whether the product is found in HMS, RMS, or both"
label variable size1_change_flag_uc "indicates a product size change occurred at some point in time in RMS"

compress

* name units properly
gen size1_units_orig = size1_units
tab size1_units
replace size1_units = "Cubic Foot" if size1_units == "CF"
replace size1_units = "Count" if size1_units == "CT"
replace size1_units = "Expired" if size1_units == "EXP"
replace size1_units = "Foot" if size1_units == "FT"
replace size1_units = "Liter" if size1_units == "LI"
replace size1_units = "Milliliter" if size1_units == "ML"
replace size1_units = "Ounce" if size1_units == "OZ"
replace size1_units = "Pound" if size1_units == "PO"
replace size1_units = "Quart" if size1_units == "QT"
replace size1_units = "Square Foot" if size1_units == "SQ FT"
replace size1_units = "Yard" if size1_units == "YD"

*collapse units
gen size1_amount_orig = size1_amount
replace size1_amount = size1_amount*16 if size1_units=="Pound"
replace size1_units = "Ounce" if size1_units=="Pound"
replace size1_amount = size1_amount*1000 if size1_units=="Liter"
replace size1_units = "Milliliter" if size1_units=="Liter"
replace size1_amount = size1_amount*3 if size1_units=="Yard"
replace size1_units = "Foot" if size1_units=="Yard"
replace size1_amount = size1_amount*946.353 if size1_units=="Quart"
replace size1_units = "Milliliter" if size1_units=="Quart"
replace size1_amount = size1_amount*957.506 if size1_units=="Cubic Foot"
replace size1_units = "Ounce" if size1_units=="Cubic Foot"
replace size1_amount = size1_amount*0.0338 if size1_units=="Milliliter"
replace size1_units = "Ounce" if size1_units=="Milliliter"
tab size1_units

rename size1_units size1_units_desc
encode size1_units_desc, gen(size1_units)

compress
sort upc upc_ver_uc

// measurement error: missing unit
drop if size1_units_desc=="" // 131 UPCs deleted

// In a product module, keep only set of UPCs measured by the most frequent units
bys product_module_code size1_units: gen num_upc_module_unit=_N
bys product_module_code: egen num_upc_module_max=max(num_upc_module_unit)
drop if num_upc_module_unit<num_upc_module_max 

// number of units in each product module, it should be 1
by product_module_code size1_units, sort: gen num_units = _n == 1
by product_module_code: replace num_units = sum(num_units)
by product_module_code: replace num_units = num_units[_N]
lab var num_units "number of different size units in product module category"
tab num_units

drop num_upc_module_unit num_upc_module_max num_units
drop if dataset_found_uc==""

save "INTERMEDIATE/products_clean.dta",replace


* Import product extra files

// Products extra, merge all the files into one large file 
tempfile pe
local year_start = 2006
local year_end = 2015

forvalues year = `year_start'/`year_end' {

insheet using "RAW/products_extra_`year'.tsv", tab clear

if `year' == 2009 {
tostring dosage_descr gender_descr target_skin_condition_descr use_descr, replace force
}
if `year' == 2010 {
tostring flavor_descr, replace force
}

if `year' != `year_start' { 
append using `pe'
}
save `pe', replace 
}

label variable upc "UPC code of product"
label variable upc_ver_uc "UPC code version"
label variable panel_year "Panel file with panelist data"
label variable flavor_code "Flavor code"
label variable flavor_descr "Flavor description"
label variable form_code "Form code"
label variable form_descr "Form description"
label variable formula_code "Formula code"
label variable formula_descr "Formula description"
label variable container_code "Container code"
label variable container_descr "Container description"
label variable salt_content_code "Salt content code"
label variable salt_content_descr "Salt content description"
label variable style_code "Style code"
label variable style_descr "Style description"
label variable type_code "Type code"
label variable type_descr "Type description"
label variable product_code "Product code"
label variable product_descr "Product description"
label variable variety_code "Variety code"
label variable variety_descr "Variety description"
label variable organic_claim_code "Organic claimed by product code"
label variable organic_claim_descr "Organic claimed by product description"
label variable usda_organic_seal_code "USDA Organic seal on product code"
label variable usda_organic_seal_descr "USDA Organic seal on product description"
label variable common_consumer_name_code "Commonly used name code"
label variable common_consumer_name_descr "CCN description"
label variable strength_code "Strength code"
label variable strength_descr "Strength description"
label variable scent_code "Scent code"
label variable scent_descr "Scent description"
label variable dosage_code "Dosage code"
label variable dosage_descr "Dosage description"
label variable gender_code "Gender code"
label variable gender_descr "Gender description"
label variable target_skin_condition_code "Skin condition targeted code"
label variable target_skin_condition_descr "Skin condition targeted description"
label variable use_code "Intended to be used by code"
label variable use_descr "Used by description"
label variable size2_code "Size of product code"
label variable size2_amount "Amount in product size"
label variable size2_units "Unit of amount of product"

compress

foreach i in flavor form formula salt_content style type product variety organic_claim usda_organic_seal common_consumer_name ///
	strength scent dosage gender target_skin_condition use size2 container {
		replace `i'_code=0 if `i'_code==.
}

replace size2_amount=0 if size2_amount==.

* name units properly
gen size2_units_orig = size2_units
tab size2_units
replace size2_units = "Cubic Foot" if size2_units == "CF"
replace size2_units = "Count" if size2_units == "CT"
replace size2_units = "Expired" if size2_units == "EXP"
replace size2_units = "Foot" if size2_units == "FT"
replace size2_units = "Liter" if size2_units == "LI"
replace size2_units = "Milliliter" if size2_units == "ML"
replace size2_units = "Ounce" if size2_units == "OZ"
replace size2_units = "Pound" if size2_units == "PO" |  size2_units == "POUND" |  size2_units == "POUNDS"
replace size2_units = "Quart" if size2_units == "QT" |  size2_units == "QUART" |  size2_units == "QUARTS"
replace size2_units = "Square Foot" if size2_units == "SQ FT"  |  size2_units == "SF" |  size2_units == "SQ SF"
replace size2_units = "Yard" if size2_units == "YD"

*collapse units
gen size2_amount_orig = size2_amount
replace size2_amount = size2_amount*16 if size2_units=="Pound"
replace size2_units = "Ounce" if size2_units=="Pound"
replace size2_amount = size2_amount*1000 if size2_units=="Liter"
replace size2_units = "Milliliter" if size2_units=="Liter"
replace size2_amount = size2_amount*3 if size2_units=="Yard"
replace size2_units = "Foot" if size2_units=="Yard"
replace size2_amount = size2_amount*946.353 if size2_units=="Quart"
replace size2_units = "Milliliter" if size2_units=="Quart"
replace size2_amount = size2_amount*957.506 if size2_units=="Cubic Foot"
replace size2_units = "Ounce" if size2_units=="Cubic Foot"
replace size2_amount = size2_amount*0.0338 if size2_units=="Milliliter"
replace size2_units = "Ounce" if size2_units=="Milliliter"
tab size2_units

rename size2_units size2_units_desc
encode size2_units_desc, gen(size2_units)

duplicates drop upc upc_ver_uc flavor_code flavor_descr form_code form_descr formula_code formula_descr container_code container_descr ///
	salt_content_code salt_content_descr style_code style_descr type_code type_descr product_code product_descr variety_code variety_descr ///
	organic_claim_code organic_claim_descr usda_organic_seal_code usda_organic_seal_descr common_consumer_name_code common_consumer_name_descr ///
	strength_code strength_descr scent_code scent_descr dosage_code dosage_descr gender_code gender_descr target_skin_condition_code  ///
	target_skin_condition_descr use_code use_descr size2_code size2_amount size2_units, force

*keep only most recent year;
sort upc upc_ver_uc panel_year
by upc upc_ver_uc: keep if _n==_N

// clean up organic descriptions;
cap drop organic_clean
gen organic_clean = 0
replace organic_clean = 1 if usda_organic_seal_code==4 | usda_organic_seal_code==30 | usda_organic_seal_code==123 | usda_organic_seal_code==432 | ///
							 usda_organic_seal_code==258 | usda_organic_seal_code==373
cap label drop organic_clean
label define organic_clean 0 "not organic" 1 "organic"
label values organic_clean organic_clean

// clean up scent descriptions
gen scent_clean = scent_code
replace scent_clean=0  if scent_code==1428 | scent_code==302 | scent_code==219 | ///
						  scent_code==3613 | scent_code==226 | scent_code==3667	

save "INTERMEDIATE/products_extra.dta", replace

use "INTERMEDIATE/products_clean.dta", clear 
merge 1:1 upc upc_ver_uc using "INTERMEDIATE/products_extra.dta"
keep if _merge==3 
drop _merge
save "INTERMEDIATE/products_clean_extras.dta", replace


****************************************
** 1. Prepare Three Datasets on Store, Product, and Product Module
****************************************
* note that we work on the permanent folder (dirin1)

** Store: Keep the balanced stores
local year "2006 2007 2008 2009 2010 2011 2012 2013 2014 2015"
foreach y of local year {
insheet using "RAW/`y'/Annual_Files/stores_`y'.tsv", tab clear
keep store_code_uc dma_code year
save "INTERMEDIATE/store_`y'.dta",replace
}

use "INTERMEDIATE/store_2006.dta",clear
local year "2007 2008 2009 2010 2011 2012 2013 2014 2015"
foreach y of local year {
append using "INTERMEDIATE/store_`y'.dta"
}
bys store_code_uc: gen dup=[_N]
tab dup
keep if dup==10
keep store_code_uc
duplicates drop // 31,678 strongly balanced stores
save "INTERMEDIATE/store_balanced.dta",replace

** Product: RMS version
local year "2006 2007 2008 2009 2010 2011 2012 2013 2014 2015"
foreach y of local year {
insheet using "RAW/`y'/Annual_Files/rms_versions_`y'.tsv", tab clear
keep upc upc_ver_uc
save "INTERMEDIATE/rms_versions_`y'.dta",replace
}

** Product module: list of product_group_code and product_module_code for the loop
// use Statransfer for dta file first
use "INTERMEDIATE/Product_Hierarchy_10.26.2016.dta",clear
drop if product_group_code==.
drop if Deferred__Please_see_documentati!=""
drop if Scanner_data_years_no_data!=""
drop if Available_only_in_Consumer_Panel!=""
drop if May_not_contain_data_ANY_years_i!=""
drop if product_module_code==9999 // Unclassified UPCs
gen group=string(product_group_code,"%04.0f")
gen module=string(product_module_code,"%04.0f")
sort group module
distinct
save "INTERMEDIATE/Product_Hierarchy_clean.dta",replace


****************************************
** 2. Import Raw Data and Sum at UPC-quarter level
****************************************

use "INTERMEDIATE/Product_Hierarchy_clean.dta",clear

local year "2006 2007 2008 2009 2010 2011 2012 2013 2014 2015"

levelsof group, local(group_new)
foreach g of local group_new {
use "INTERMEDIATE/Product_Hierarchy_clean.dta",clear
drop if module=="7203" // Insecticide-Flying Insect-Liquid, no obs. in 2015
drop if module=="8605" // Ipecac Product, no obs. in 2014 and 2015
levelsof module if group=="`g'", local(module_new)
foreach m of local module_new {
di "`g'"
di "`m'"

foreach y of local year {
di "`y'"
insheet using "$dirin/`y'/Movement_Files/`g'_`y'/`m'_`y'.tsv", tab clear

capture drop feature display
capture destring store_code_uc, replace
capture destring upc, replace
capture destring units, replace
capture destring prmult, replace
capture destring price, replace
replace price=price/prmult

drop if price==.|units==. // data is very clean; but just in case
drop if price<=0.01 // Nielsen does not allow prices to be zero. If a good is free, then it is coded/converted to 1 cent.

* identify quarters
capture tostring week_end, replace
gen year = substr(week_end,1,4)
drop if year!="`y'"
gen month = substr(week_end,5,2)
destring year, replace
destring month, replace
gen quarter=ceil(month/3)

* Variables
gen revenue=price*units
gen num=1

collapse (sum) revenue units num, by (year quarter upc store_code_uc) fast
compress
save "INTERMEDIATE/upc_quarter_store/`m'_`y'.dta",replace
}
}
}


****************************************
** 3. GENERATE UPC_QUARTER_ACTUAL-BALANCED.DTA
****************************************

clear all
set more off
use "INTERMEDIATE/Product_Hierarchy_clean.dta",clear

local year "2006 2007 2008 2009 2010 2011 2012 2013 2014 2015"

tempfile tr
local i=1
levelsof group, local(group_new)
foreach g of local group_new {
use "INTERMEDIATE/Product_Hierarchy_clean.dta",clear
drop if module=="7203" // Insecticide-Flying Insect-Liquid, no obs. in 2015
drop if module=="8605" // Ipecac Product, no obs. in 2014 and 2015

drop if module=="7189" // PESTICIDES - TOMATO & VEGETABLE, no balanced stores
drop if module=="1304" // CAT FOOD - MOIST TYPE, no balanced stores
drop if module=="1434" // PUDDING - PLUM - CANNED, no balanced stores
drop if module=="2691" // FROZEN MEAT - VEAL, no balanced stores
drop if module=="7742" // HOUSEHOLD AREA ALLERGEN CONTROL, no balanced stores
drop if module=="8616" // SUN EXPOSURE DETECTOR PRODUCT TOPICAL, no balanced stores
drop if module=="7500" // BEER & WINE MAKING KITS, no obs. after balanced stores

levelsof module if group=="`g'", local(module_new)
foreach m of local module_new {
foreach y of local year {
use "INTERMEDIATE/upc_quarter_store_1/`m'_`y'.dta",clear
merge m:1 store_code_uc using "INTERMEDIATE/store/`m'.dta", keep(match) nogenerate ///
merge m:1 upc using "INTERMEDIATE/rms_versions_`y'.dta", keep(match) nogenerate ///
keepusing (upc_ver_uc)
merge m:1 upc upc_ver_uc using "INTERMEDIATE/products_clean.dta", keep(match) nogenerate ///
keepusing(multi size1_amount)
gen quantity=units*multi*size1_amount
gen num_store=1
collapse (sum) revenue quantity num num_store, by (year quarter upc upc_ver_uc) fast
gen group=`g'
gen module=`m'
compress
if `i'==1 {
}
else {
append using `tr'
}
save `tr',replace
local i=`i'+1
}
}
}

save "INTERMEDIATE/upc_quarter_actual-balanced.dta",replace




****************************************************************
** 4. Data cleaning
****************************************************************

**
** 4-1. def10 - barcode
**

use  "INTERMEDIATE/upc_quarter_actual-balanced.dta", clear

// name_firm (name) firm (numeric code) should be generated once GS1 data is obtained

rename upc product
collapse (sum) revenue quantity num num_store ///
, by(product year quarter department group module)
drop if product==.

rename quarter qtr
egen quarter=group(year qtr)
gen t=yq(year,qtr)
format %tq t

compress

* Creates longitudinal variables 

* Defines cohort/entryTime, exitTime, age, and MaxAge
bysort product: egen x=min(quarter)
gen lcensored=(x==1)
gen cohort=x 
bysort product: egen y=max(quarter)
qui su y
gen rcensored=(y==`r(max)') 
gen exitTime=y 
gen age=quarter-cohort 
bysort product: egen maxAge=max(age) 
bysort product: egen totalobs=count(quarter)

* Type of observation 
gen type="NA"
replace type="entry"      if age==0 & age!=maxAge  
replace type="continues"  if age>0  & age<maxAge  
replace type="exit"       if age==maxAge & age!=0 
replace type="entry/exit" if age==maxAge & age==0 
gen censored="NA"  // NEW & CORRECTED
replace censored="left censored"   if lcensored==1   & rcensored==0
replace censored="right censored"  if rcensored==1 & lcensored==0
replace censored="left censored & right censored" if lcensored==1  & rcensored==1   
replace censored="not censored" if lcensored==0  & rcensored==0
tab type censored, m

gen flag_longitudinal="NA"
replace flag_longitudinal="Complete" if totalobs==y-x+1
replace flag_longitudinal="Incomplete" if totalobs<y-x+1

* Revenue variables
merge m:1 t year qtr using "$dir/BCindicators/cpi.dta", keep(1 3)
gen revenue_r=revenue/cpi
bys product: egen rr=mean(revenue_r) if age>0  & age<maxAge  
bys product: egen revenue_r_m=mean(rr)
drop rr

* Clean variables
	
* Cleaned data
keep product t year quarter qtr department group module name_firm firm brand brand_descr generic num num_store ///
cohort exitTime censored age maxAge type flag_longitudinal totalobs revenue quantity revenue_r revenue_r_m 
save "INTERMEDIATE/temp.dta", replace

* Rename exits
use "INTERMEDIATE/temp.dta", clear
tempfile exits
keep if type=="exit" | type=="entry/exit"
keep  product year qtr department group module name_firm firm brand brand_descr generic ///
cohort exitTime censored age maxAge type flag_longitudinal totalobs revenue_r_m
replace qtr=qtr+1 
replace year=year+1 if qtr==5
replace qtr=1 if qtr==5
egen quarter=group(year qtr)
replace quarter=quarter+1
gen t=yq(year,qtr)
format %tq t
replace type="exit"
save `exits'
use "INTERMEDIATE/temp.dta", clear
replace type="continues" if type=="exit"
replace type="entry" if type=="entry/exit" 
append using `exits'


* price
gen price=revenue/quantity

* Defines quality for each product
	* Baseline
		bysort quarter module: egen price_median_tm0=median(price) if type=="continues" // changes to exclude entrants and exists from median calculation
		bysort quarter module: egen price_median_tm=mean(price_median_tm0)
		drop price_median_tm0
		gen quality0=ln(price/price_median_tm)
		label variable quality0 "proxy for quality of good j of category c in quarter t" 
	* Percentile-based 
		sort  quarter module price
		bysort quarter module : gen  quality7  = 100*((_n-0.5)/_N) 
		label variable quality7 "proxy for quality percentile with regards to "
	

** sample A

save "INTERMEDIATE/product_RMS_def10_firm_sampleA_06_15.dta", replace

** sample E (Benchmark)

* Sample
	keep if flag_longitudinal=="Complete"
	keep if generic==0
	keep if department!=8 & department!=9

save "INTERMEDIATE/product_RMS_def10_firm_sampleE_06_15.dta", replace



